In [22]:
import logging
import numpy as np
import pandas as pd
# Send log output to the console.
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
import datetime
%matplotlib inline
from shapely.prepared import prep
from shapely import speedups
speedups.enable()  # speed up repeated Shapely geometry operations
In [ ]:
# Columns of interest from the raw occurrence chunks.
important_columns1 = ['species', 'dateidentified', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude', 'day', 'month', 'year']
result_with_lat_long = pd.DataFrame(columns=important_columns1)
counter = 0
# Read the merged occurrence data in chunks to keep memory usage low.
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    counter += 1
    if counter % 100 == 0:
        print("%s Processing.. %s " % (datetime.datetime.now().time().isoformat(), counter))
    # Keep only chunks that contain coordinates, and only the columns of interest.
    if "decimallatitude" in df.columns.tolist() and "decimallongitude" in df.columns.tolist():
        common_columns = list(set(important_columns1).intersection(set(df.columns.tolist())))
        result_with_lat_long = result_with_lat_long.append(df[common_columns], ignore_index=True)
In [ ]:
result_with_lat_long = result_with_lat_long[result_with_lat_long.decimallatitude.notnull() & result_with_lat_long.decimallongitude.notnull()]
In [ ]:
result_with_lat_long['species'].unique().size
It is best to take into account all observations that have either "year" or "eventdate" present (or both). Let's group them by species name and count the number of observation records.
In [ ]:
grouped_lat_long_year_or_eventdate = pd.DataFrame()
grouped_lat_long_year_or_eventdate['count'] = result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].groupby(['species']).apply(lambda x: x['species'].count())
grouped_lat_long_year_or_eventdate.head(10) # peek at the top 10 only
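A note on the grouping above: the same per-species counts can also be obtained with value_counts, which avoids the explicit groupby/apply (a small sketch using the same filter as above; the intermediate name "mask" is just illustrative, and value_counts returns the counts sorted in descending order).
In [ ]:
# Equivalent per-species observation counts via value_counts.
mask = result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()
result_with_lat_long[mask]['species'].value_counts().head(10)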
In [ ]:
result_with_lat_long['species'].unique().size
In [ ]:
# Keep only observations from after 1990, using either the numeric 'year' or the 'eventdate' string.
year_or_eventdate_1990 = result_with_lat_long[['species', 'year', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude']][(result_with_lat_long.year>1990) | (result_with_lat_long.eventdate>"1990")]
grouped_year_or_eventdate_1990 = pd.DataFrame()
grouped_year_or_eventdate_1990['numobservations'] = year_or_eventdate_1990.groupby(['species']).apply(lambda x: x['species'].count())
grouped_year_or_eventdate_1990.shape[0]
In [ ]:
year_or_eventdate_1990.basisofrecord.unique()
We should keep only records whose basisofrecord is 'OBSERVATION', 'MACHINE_OBSERVATION', or 'HUMAN_OBSERVATION'.
In [ ]:
final_selection = year_or_eventdate_1990[year_or_eventdate_1990.basisofrecord.isin(['OBSERVATION', 'HUMAN_OBSERVATION', 'MACHINE_OBSERVATION'])]
In [ ]:
final_selection.species.unique().shape
In [ ]:
final_selection
In [ ]:
from iSDM.species import GBIFSpecies
In [ ]:
all_species = GBIFSpecies(name_species='All')
In [ ]:
all_species.set_data(final_selection)
In [ ]:
all_species.get_data().species.unique().shape # these many different species
In [ ]:
all_species.get_data().shape[0] # 1939675 — the number of observations satisfying our criteria (after 1990, with the correct observation types)
In [ ]:
year_or_eventdate_1990.shape[0] # number of observations before filtering on observation type
In [ ]:
all_species.geometrize()
In [ ]:
all_species.get_data().species.unique().shape
In [ ]:
final_observations = all_species.get_data()[['species', 'year','eventdate', 'basisofrecord','geometry']]
In [ ]:
# final_observations.to_file("../data/bias_grid/final_observations", driver="ESRI Shapefile")
In [17]:
from geopandas import GeoDataFrame
final_observations = GeoDataFrame.from_file("../data/bias_grid/final_observations/")
In [18]:
final_observations.head()
Out[18]:
30 arcsec = 0.5/60 degrees ≈ 0.0083333333 degrees per pixel, which corresponds to 'height': 21600, 'width': 43200 for a global map.
Generate a 2D array and use it as the basis for the bias grid.
In [19]:
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.083333333 # 5 arcmin resolution; use 0.0083333333 for 30 arcsec
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
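As a quick sanity check (a minimal sketch, assuming the 5-arcmin pixel size chosen above), the expected global grid dimensions can be verified directly:
In [ ]:
# At 5 arcmin the global grid should be 4320 x 2160 pixels;
# at 30 arcsec it would be 43200 x 21600 instead.
assert (x_res, y_res) == (4320, 2160)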
In [20]:
x_res
Out[20]:
In [21]:
y_res
Out[21]:
We will use a memory-mapped bias-grid file to keep RAM usage from exploding, at least while the results are computed.
In [23]:
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm_5arcmin.dat", dtype='int32', mode='w+', shape=(y_res,x_res))
In [24]:
def increase_pixel(row):
    # Convert the point's latitude/longitude to (row, column) indices in the global grid
    # and increment the observation count for that pixel.
    bias_grid_mm[np.abs(int((row.y - 90) / pixel_size)),
                 np.abs(int((row.x + 180) / pixel_size))] += 1
In [25]:
here = final_observations.geometry.apply(lambda row: increase_pixel(row)) # keeps the memory usage stable
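To illustrate how this mapping works, here is a small sketch with a hypothetical sample point (the row/column formulas are copied from increase_pixel above): the latitude is measured as a pixel offset from the northern edge and the longitude from the western edge.
In [ ]:
from shapely.geometry import Point
sample = Point(4.90, 52.37)  # hypothetical observation: (longitude, latitude)
row_idx = np.abs(int((sample.y - 90) / pixel_size))   # offset from the northern edge, in pixels
col_idx = np.abs(int((sample.x + 180) / pixel_size))  # offset from the western edge, in pixels
print(row_idx, col_idx)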
In [26]:
bias_grid_mm.max()
Out[26]:
In [13]:
bias_grid_mm.std() # this still eats memory (it uses intermediate data structures, of course)
Out[13]:
In [27]:
bias_grid_mm.sum()
Out[27]:
In [28]:
# check if the number of all observations is equal to the bias_grid sum of observations
bias_grid_mm.sum() == final_observations.shape[0]
Out[28]:
In [29]:
bias_grid_mm.flush()
In [19]:
del bias_grid_mm
In [ ]:
# To read the memory-mapped grid back later, the following is used:
In [16]:
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm_5arcmin.dat", dtype='int32', mode='r', shape=(y_res,x_res))
In [5]:
bias_grid_mm.sum()
Out[5]:
In [6]:
import gc
gc.collect()
Out[6]:
In [30]:
import rasterio
from rasterio.transform import Affine
# Map pixel (column, row) indices to (longitude, latitude): origin at the top-left corner, north-up.
transform = Affine.translation(x_min, y_max) * Affine.scale(pixel_size, -pixel_size)
crs = {'init': "EPSG:4326"}  # WGS84
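As a quick check (a sketch, not part of the original pipeline), the affine transform should map the top-left pixel corner to (x_min, y_max) and the bottom-right corner to approximately (x_max, y_min):
In [ ]:
# The transform maps (column, row) pixel coordinates to (longitude, latitude).
print(transform * (0, 0))          # expected: (-180.0, 90.0), the top-left corner
print(transform * (x_res, y_res))  # expected: approximately (180.0, -90.0)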
In [31]:
transform
Out[31]:
In [32]:
with rasterio.open("../data/bias_grid/bias_grid_5arcmin.tif", 'w', driver='GTiff', width=x_res, height=y_res,
                   count=1,
                   dtype=np.uint32,
                   nodata=0,
                   transform=transform,
                   crs=crs) as out:
    out.write(bias_grid_mm.astype(np.uint32), indexes=1)
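To make sure the GeoTIFF on disk matches the in-memory bias grid, it can be read back and compared (a verification sketch; the path is the one used above):
In [ ]:
# Read the raster back and compare its total count and shape with the memory-mapped grid.
with rasterio.open("../data/bias_grid/bias_grid_5arcmin.tif") as src:
    written = src.read(1)
print(written.sum() == bias_grid_mm.sum())
print(written.shape)  # expected: (y_res, x_res)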
In [33]:
bias_grid_mm.shape
Out[33]: